Index

Alignmnent and counting of the fastq files

This step is performed on the O2 cluster. The fastq file quality was checked using FastQC and MultiQC. They are aligned to Ensembl mm10 genome and counted using Salmon pseudoaligner. Output sf files were transfered from O2 to local machine for further processing in R.

Library loading and set up

suppressMessages(
  c(library(DESeq2),
    library(limma),
    library(tximport),
    library(gridExtra),
    library(ensembldb),
    library(EnsDb.Mmusculus.v79),
    library(grid),
    library(ggplot2),
    library(lattice),
    library(reshape),
    library(mixOmics),
    library(gplots),
    library(RColorBrewer),
    library(readr),
    library(dplyr),
    library(VennDiagram),
    library(clusterProfiler),
    library(DOSE),
    library(org.Mm.eg.db), 
    library(pathview),
    library(AnnotationDbi))
)
## Warning: package 'ensembldb' was built under R version 3.5.3
## Warning: package 'GenomicFeatures' was built under R version 3.5.3
##   [1] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##   [4] "BiocParallel"         "matrixStats"          "Biobase"             
##   [7] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [10] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [13] "stats4"               "stats"                "graphics"            
##  [16] "grDevices"            "utils"                "datasets"            
##  [19] "methods"              "base"                 "limma"               
##  [22] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##  [25] "BiocParallel"         "matrixStats"          "Biobase"             
##  [28] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [31] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [34] "stats4"               "stats"                "graphics"            
##  [37] "grDevices"            "utils"                "datasets"            
##  [40] "methods"              "base"                 "tximport"            
##  [43] "limma"                "DESeq2"               "SummarizedExperiment"
##  [46] "DelayedArray"         "BiocParallel"         "matrixStats"         
##  [49] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
##  [52] "IRanges"              "S4Vectors"            "BiocGenerics"        
##  [55] "parallel"             "stats4"               "stats"               
##  [58] "graphics"             "grDevices"            "utils"               
##  [61] "datasets"             "methods"              "base"                
##  [64] "gridExtra"            "tximport"             "limma"               
##  [67] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##  [70] "BiocParallel"         "matrixStats"          "Biobase"             
##  [73] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [76] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [79] "stats4"               "stats"                "graphics"            
##  [82] "grDevices"            "utils"                "datasets"            
##  [85] "methods"              "base"                 "ensembldb"           
##  [88] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
##  [91] "gridExtra"            "tximport"             "limma"               
##  [94] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##  [97] "BiocParallel"         "matrixStats"          "Biobase"             
## [100] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [103] "S4Vectors"            "BiocGenerics"         "parallel"            
## [106] "stats4"               "stats"                "graphics"            
## [109] "grDevices"            "utils"                "datasets"            
## [112] "methods"              "base"                 "EnsDb.Mmusculus.v79" 
## [115] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [118] "AnnotationDbi"        "gridExtra"            "tximport"            
## [121] "limma"                "DESeq2"               "SummarizedExperiment"
## [124] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [127] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [130] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [133] "parallel"             "stats4"               "stats"               
## [136] "graphics"             "grDevices"            "utils"               
## [139] "datasets"             "methods"              "base"                
## [142] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [145] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [148] "gridExtra"            "tximport"             "limma"               
## [151] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [154] "BiocParallel"         "matrixStats"          "Biobase"             
## [157] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [160] "S4Vectors"            "BiocGenerics"         "parallel"            
## [163] "stats4"               "stats"                "graphics"            
## [166] "grDevices"            "utils"                "datasets"            
## [169] "methods"              "base"                 "ggplot2"             
## [172] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [175] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [178] "gridExtra"            "tximport"             "limma"               
## [181] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [184] "BiocParallel"         "matrixStats"          "Biobase"             
## [187] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [190] "S4Vectors"            "BiocGenerics"         "parallel"            
## [193] "stats4"               "stats"                "graphics"            
## [196] "grDevices"            "utils"                "datasets"            
## [199] "methods"              "base"                 "lattice"             
## [202] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [205] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [208] "AnnotationDbi"        "gridExtra"            "tximport"            
## [211] "limma"                "DESeq2"               "SummarizedExperiment"
## [214] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [217] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [220] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [223] "parallel"             "stats4"               "stats"               
## [226] "graphics"             "grDevices"            "utils"               
## [229] "datasets"             "methods"              "base"                
## [232] "reshape"              "lattice"              "ggplot2"             
## [235] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [238] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [241] "gridExtra"            "tximport"             "limma"               
## [244] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [247] "BiocParallel"         "matrixStats"          "Biobase"             
## [250] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [253] "S4Vectors"            "BiocGenerics"         "parallel"            
## [256] "stats4"               "stats"                "graphics"            
## [259] "grDevices"            "utils"                "datasets"            
## [262] "methods"              "base"                 "mixOmics"            
## [265] "MASS"                 "reshape"              "lattice"             
## [268] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [271] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [274] "AnnotationDbi"        "gridExtra"            "tximport"            
## [277] "limma"                "DESeq2"               "SummarizedExperiment"
## [280] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [283] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [286] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [289] "parallel"             "stats4"               "stats"               
## [292] "graphics"             "grDevices"            "utils"               
## [295] "datasets"             "methods"              "base"                
## [298] "gplots"               "mixOmics"             "MASS"                
## [301] "reshape"              "lattice"              "ggplot2"             
## [304] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [307] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [310] "gridExtra"            "tximport"             "limma"               
## [313] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [316] "BiocParallel"         "matrixStats"          "Biobase"             
## [319] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [322] "S4Vectors"            "BiocGenerics"         "parallel"            
## [325] "stats4"               "stats"                "graphics"            
## [328] "grDevices"            "utils"                "datasets"            
## [331] "methods"              "base"                 "RColorBrewer"        
## [334] "gplots"               "mixOmics"             "MASS"                
## [337] "reshape"              "lattice"              "ggplot2"             
## [340] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [343] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [346] "gridExtra"            "tximport"             "limma"               
## [349] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [352] "BiocParallel"         "matrixStats"          "Biobase"             
## [355] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [358] "S4Vectors"            "BiocGenerics"         "parallel"            
## [361] "stats4"               "stats"                "graphics"            
## [364] "grDevices"            "utils"                "datasets"            
## [367] "methods"              "base"                 "readr"               
## [370] "RColorBrewer"         "gplots"               "mixOmics"            
## [373] "MASS"                 "reshape"              "lattice"             
## [376] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [379] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [382] "AnnotationDbi"        "gridExtra"            "tximport"            
## [385] "limma"                "DESeq2"               "SummarizedExperiment"
## [388] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [391] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [394] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [397] "parallel"             "stats4"               "stats"               
## [400] "graphics"             "grDevices"            "utils"               
## [403] "datasets"             "methods"              "base"                
## [406] "dplyr"                "readr"                "RColorBrewer"        
## [409] "gplots"               "mixOmics"             "MASS"                
## [412] "reshape"              "lattice"              "ggplot2"             
## [415] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [418] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [421] "gridExtra"            "tximport"             "limma"               
## [424] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [427] "BiocParallel"         "matrixStats"          "Biobase"             
## [430] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [433] "S4Vectors"            "BiocGenerics"         "parallel"            
## [436] "stats4"               "stats"                "graphics"            
## [439] "grDevices"            "utils"                "datasets"            
## [442] "methods"              "base"                 "VennDiagram"         
## [445] "futile.logger"        "dplyr"                "readr"               
## [448] "RColorBrewer"         "gplots"               "mixOmics"            
## [451] "MASS"                 "reshape"              "lattice"             
## [454] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [457] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [460] "AnnotationDbi"        "gridExtra"            "tximport"            
## [463] "limma"                "DESeq2"               "SummarizedExperiment"
## [466] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [469] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [472] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [475] "parallel"             "stats4"               "stats"               
## [478] "graphics"             "grDevices"            "utils"               
## [481] "datasets"             "methods"              "base"                
## [484] "clusterProfiler"      "VennDiagram"          "futile.logger"       
## [487] "dplyr"                "readr"                "RColorBrewer"        
## [490] "gplots"               "mixOmics"             "MASS"                
## [493] "reshape"              "lattice"              "ggplot2"             
## [496] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [499] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [502] "gridExtra"            "tximport"             "limma"               
## [505] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [508] "BiocParallel"         "matrixStats"          "Biobase"             
## [511] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [514] "S4Vectors"            "BiocGenerics"         "parallel"            
## [517] "stats4"               "stats"                "graphics"            
## [520] "grDevices"            "utils"                "datasets"            
## [523] "methods"              "base"                 "DOSE"                
## [526] "clusterProfiler"      "VennDiagram"          "futile.logger"       
## [529] "dplyr"                "readr"                "RColorBrewer"        
## [532] "gplots"               "mixOmics"             "MASS"                
## [535] "reshape"              "lattice"              "ggplot2"             
## [538] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [541] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [544] "gridExtra"            "tximport"             "limma"               
## [547] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [550] "BiocParallel"         "matrixStats"          "Biobase"             
## [553] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [556] "S4Vectors"            "BiocGenerics"         "parallel"            
## [559] "stats4"               "stats"                "graphics"            
## [562] "grDevices"            "utils"                "datasets"            
## [565] "methods"              "base"                 "org.Mm.eg.db"        
## [568] "DOSE"                 "clusterProfiler"      "VennDiagram"         
## [571] "futile.logger"        "dplyr"                "readr"               
## [574] "RColorBrewer"         "gplots"               "mixOmics"            
## [577] "MASS"                 "reshape"              "lattice"             
## [580] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [583] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [586] "AnnotationDbi"        "gridExtra"            "tximport"            
## [589] "limma"                "DESeq2"               "SummarizedExperiment"
## [592] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [595] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [598] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [601] "parallel"             "stats4"               "stats"               
## [604] "graphics"             "grDevices"            "utils"               
## [607] "datasets"             "methods"              "base"                
## [610] "pathview"             "org.Hs.eg.db"         "org.Mm.eg.db"        
## [613] "DOSE"                 "clusterProfiler"      "VennDiagram"         
## [616] "futile.logger"        "dplyr"                "readr"               
## [619] "RColorBrewer"         "gplots"               "mixOmics"            
## [622] "MASS"                 "reshape"              "lattice"             
## [625] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [628] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [631] "AnnotationDbi"        "gridExtra"            "tximport"            
## [634] "limma"                "DESeq2"               "SummarizedExperiment"
## [637] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [640] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [643] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [646] "parallel"             "stats4"               "stats"               
## [649] "graphics"             "grDevices"            "utils"               
## [652] "datasets"             "methods"              "base"                
## [655] "pathview"             "org.Hs.eg.db"         "org.Mm.eg.db"        
## [658] "DOSE"                 "clusterProfiler"      "VennDiagram"         
## [661] "futile.logger"        "dplyr"                "readr"               
## [664] "RColorBrewer"         "gplots"               "mixOmics"            
## [667] "MASS"                 "reshape"              "lattice"             
## [670] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [673] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [676] "AnnotationDbi"        "gridExtra"            "tximport"            
## [679] "limma"                "DESeq2"               "SummarizedExperiment"
## [682] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [685] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [688] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [691] "parallel"             "stats4"               "stats"               
## [694] "graphics"             "grDevices"            "utils"               
## [697] "datasets"             "methods"              "base"

Analysis of all samples

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate a tx2gene object for matrix generation
edb <- EnsDb.Mmusculus.v79 
transcriptsID <- as.data.frame(transcripts(edb))
tx2gene <- as.data.frame(cbind(transcriptsID$tx_id, transcriptsID$tx_id))

# Generate DESeqData using the counting result generated using Salmon
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Gene Counts/TCT")
inDir = getwd()
countFiles = list.files(inDir, pattern=".sf$", full.names = TRUE)
basename(countFiles)
##  [1] "I_MKI1_3963_LIB037245_GEN00137855_R1.sf"   
##  [2] "I_MKI2_1711_LIB037245_GEN00137858_R1.sf"   
##  [3] "I_MKI3_3504_LIB041682_GEN00156317_R1.sf"   
##  [4] "I_MKI4_3501_LIB041682_GEN00156318_R1.sf"   
##  [5] "I_MKI5_3503_LIB041682_GEN00156319_R1.sf"   
##  [6] "I_MKI6_LS78_LIB044271_GEN00169444_R1.sf"   
##  [7] "I_V1_3999_LIB041682_GEN00156316_R1.sf"     
##  [8] "I_V2_KL3687_LIB044271_GEN00169448_R1.sf"   
##  [9] "I_V3_KL3693_LIB044271_GEN00169449_R1.sf"   
## [10] "I_V4_KL3823_LIB044271_GEN00169451_R1.sf"   
## [11] "I_V5_KL3912_LIB044271_GEN00169453_R1.sf"   
## [12] "I_V6_LS79_LIB044271_GEN00169445_R1.sf"     
## [13] "NI_MKI1_3976_LIB037245_GEN00137847_R1.sf"  
## [14] "NI_MKI2_3960_LIB037245_GEN00137848_R1.sf"  
## [15] "NI_MKI3_3561_LIB037245_GEN00137849_R1.sf"  
## [16] "NI_MKI4_3462_LIB037245_GEN00137850_R1.sf"  
## [17] "NI_MKI5_3505_LIB041682_GEN00156321_R1.sf"  
## [18] "NI_MKI6_KL3695_LIB044271_GEN00169450_R1.sf"
## [19] "NI_V1_3974_LIB037245_GEN00137843_R1.sf"    
## [20] "NI_V2_3975_LIB037245_GEN00137844_R1.sf"    
## [21] "NI_V3_3962_LIB037245_GEN00137845_R1.sf"    
## [22] "NI_V4_3461_LIB037245_GEN00137846_R1.sf"    
## [23] "NI_V5_3914_LIB041682_GEN00156320_R1.sf"    
## [24] "NI_V6_KL3688_LIB044271_GEN00169446_R1.sf"  
## [25] "NI_V7_KL3832_LIB044271_GEN00169452_R1.sf"
names(countFiles) <- c('I_MKI_1','I_MKI_2','I_MKI_3','I_MKI_4','I_MKI_5','I_MKI_6', 'I_V_1', 'I_V_2', 'I_V_3', 'I_V_4', 'I_V_5', 'I_V_6', 'NI_MKI_1','NI_MKI_2','NI_MKI_3','NI_MKI_4', 'NI_MKI_5', 'NI_MKI_6', 'NI_V_1', 'NI_V_2', 'NI_V_3', 'NI_V_4', 'NI_V_5', 'NI_V_6', 'NI_V_7')

txi.salmon <- tximport(countFiles, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable_all <- data.frame(condition = c(rep('experimental', 12), rep('control', 13)))

DESeqsampletable_all$batch <- factor(c('1','1','3','3','3','4','3','4','4','4','4','4','1','1','1','1','3','4','1','1','1','1','3','4','4'))
DESeqsampletable_all$gender <- factor(c('F','F','M','F','M','F','F','M','F','M','F','M','F','F','F','M','M','M','F','F','F','F','M','M','M'))

rownames(DESeqsampletable_all) <- colnames(txi.salmon$counts)

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable_all, ~ batch + gender + condition)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 2697 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## additional priors are available via the 'type' argument, see ?lfcShrink for details

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Quality Inspection of the Gene Count Data

Generate raw count table that contains the simple counts of each gene

Data is transformed and pseudocount is calculated.

rawCountTable <- as.data.frame(counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
grid.arrange(
  ggplot(pseudoCount, aes(x= I_MKI_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_1"), 
  ggplot(pseudoCount, aes(x= I_MKI_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_2"),
  ggplot(pseudoCount, aes(x= I_MKI_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_3"),
  ggplot(pseudoCount, aes(x= I_MKI_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_4"),
  ggplot(pseudoCount, aes(x= I_MKI_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_5"),
  ggplot(pseudoCount, aes(x= I_MKI_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_6"), 
  nrow = 2)

grid.arrange(
  ggplot(pseudoCount, aes(x= I_V_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_1"),
  ggplot(pseudoCount, aes(x= I_V_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_2"),
  ggplot(pseudoCount, aes(x= I_V_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_3"),
  ggplot(pseudoCount, aes(x= I_V_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_4"),
  ggplot(pseudoCount, aes(x= I_V_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_5"),
  ggplot(pseudoCount, aes(x= I_V_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_6"),
  nrow = 2)

grid.arrange(
  ggplot(pseudoCount, aes(x= NI_MKI_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_1"),
  ggplot(pseudoCount, aes(x= NI_MKI_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_2"),
  ggplot(pseudoCount, aes(x= NI_MKI_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_3"),
  ggplot(pseudoCount, aes(x= NI_MKI_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_4"),
  ggplot(pseudoCount, aes(x= NI_MKI_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_5"),
  ggplot(pseudoCount, aes(x= NI_MKI_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_6"),
  nrow = 2)

grid.arrange(
  ggplot(pseudoCount, aes(x= NI_V_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_1"),
  ggplot(pseudoCount, aes(x= NI_V_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_2"),
  ggplot(pseudoCount, aes(x= NI_V_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_3"),
  ggplot(pseudoCount, aes(x= NI_V_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_4"),
  ggplot(pseudoCount, aes(x= NI_V_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_5"), 
  ggplot(pseudoCount, aes(x= NI_V_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_6"),
  ggplot(pseudoCount, aes(x= NI_V_7)) + xlab(expression(log[2](count + 1))) + ylab("Number of Transcripts") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_7"),
  nrow = 3)

Between-sample distribution

Check on the gene count distribution across all genes.

#Boxplots
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
df <- data.frame(df, Condition = substr(df$Samples,1,4))

ggplot(df, aes(x=Samples, y=value, fill = Condition)) + geom_boxplot() + xlab("") + 
  ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3", "#E69F00", "#FF0000")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

#Histograms and density plots
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + ylim(c(0, 0.25)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab(expression(log[2](count+1)))

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_by_transcript.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
  geom_text(aes(label=name), vjust = 2) +
  xlim(-75, 55) + ylim(-45, 45)

I_V vs NI_V

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate DESeqData using the counting result generated using Salmon
countFiles_vcompare <- c(countFiles[19:25], countFiles[7:12])

countFiles_vcompare <- countFiles_vcompare[c(-3,-5)]

txi.salmon <- tximport(countFiles_vcompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(19:25, 7:12), ]
DESeqsampletable <- DESeqsampletable[c(-3,-5),]

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## additional priors are available via the 'type' argument, see ?lfcShrink for details

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_I_V vs NI_V_transcript_level.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
  geom_text(aes(label=name), vjust = 2) +
  xlim(-100, 79) + ylim(-65, 45)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

keep <- rowMeans(rawCountTable[,1:5]) > 0 | rowMeans(rawCountTable[,6:11]) > 0
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_V vs NI_V_transcript_level.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_V vs NI_V_transcript_level.csv")

Draw heatmap for transcripts that are significantly dysregulated in I_V samples compared to NI_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]

dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:17] <- zscore(as.numeric(sig_dif[i,7:17]))
}

my_palette <- colorRampPalette(c("blue", "white", "red"))(256)
heatmap_matrix <- as.matrix(sig_dif[,7:17])

png('I_V vs NI_V RNASeq_transcript_level.png',
    width = 300,
    height = 600,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "I_V vs NI_V RNASeq",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Transcripts",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for differential genes

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,6:11])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
ggplot(dif_analysis, aes(x = NI_V_mean, y = I_V_mean)) +
  xlab("NI_V_Average(log2)") + ylab("I_V_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "I_V vs NI_V Scatter Plot")

# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V MA Plot")

# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V Volcano Plot")
## Warning: Removed 503 rows containing missing values (geom_point).

I_MKI vs I_V

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate DESeqData using the counting result generated using Salmon
countFiles_icompare <- c(countFiles[7:12], countFiles[1:6])

txi.salmon <- tximport(countFiles_icompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(7:12, 1:6), ]
DESeqsampletable$condition <- factor(c(rep('control', 6), rep('experimental', 6)))

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## additional priors are available via the 'type' argument, see ?lfcShrink for details

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_I_MKI vs I_V_transcript_level.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
  geom_text(aes(label=name), vjust = 2) +
  xlim(-75, 100) + ylim(-75, 50)

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

I’m applying a different filtering criteria here. After discussing with Lucia, we decided to only look at genes that are expressed in 3 or more samples in each group.

keep <- rowMeans(rawCountTable[,1:6]) > 0 | rowMeans(rawCountTable[,7:12]) > 0
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_MKI vs I_V_transcript_level.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_MKI vs I_V_transcript_level.csv")

Draw heatmap for transcripts that are significantly dysregulated in I_MKI samples compared to I_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

Here I relax the padj cutoff to 0.1

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]

dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:18] <- zscore(as.numeric(sig_dif[i,7:18]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:18])

png('I_MKI vs I_V RNASeq_transcript_level.png',
    width = 300,
    height = 600,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "I_MKI vs I_V RNASeq",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "both",
          labRow = FALSE,
          keysize = 2,
          ylab = "Transcripts",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for differential genes

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_MKI_mean <- rowMeans(detected_pseudocount[,7:12])
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,1:6])
ggplot(dif_analysis, aes(x = I_V_mean, y = I_MKI_mean)) +
  xlab("I_V_Average(log2)") + ylab("I_MKI_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "I_MKI vs I_V Scatter Plot")

# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V MA Plot")

# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V Volcano Plot")
## Warning: Removed 742 rows containing missing values (geom_point).

NI_MKI vs NI_V

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate DESeqData using the counting result generated using Salmon
countFiles_nicompare <- c(countFiles[19:25], countFiles[13:18])
countFiles_nicompare <- countFiles_nicompare[c(-3,-5)]

txi.salmon <- tximport(countFiles_nicompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(19:25, 13:18), ]
DESeqsampletable <- DESeqsampletable[c(-3,-5),]
DESeqsampletable$condition <- factor(c(rep('control', 5), rep('experimental', 6)))

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ batch + condition + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## additional priors are available via the 'type' argument, see ?lfcShrink for details

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_NI_MKI vs NI_V_transcript_level.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
  geom_text(aes(label=name), vjust = 2) +
  xlim(-65, 60) + ylim(-40, 40)

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

keep <- rowMeans(rawCountTable[,1:5]) > 0 | rowMeans(rawCountTable[,6:11]) > 0
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_NI_MKI vs NI_V_transcript_level.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_NI_MKI vs NI_V_transcript_level.csv")

Draw heatmap for transcripts that are significantly dysregulated in NI_MKI samples compared to NI_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

Here I relaxed the padj cutoff to 0.1

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]

dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:17] <- zscore(as.numeric(sig_dif[i,7:17]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:17])

png('NI_MKI vs NI_V RNASeq_transcript_level.png',
    width = 1000,
    height = 1000,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "NI_MKI vs NI_V RNASeq",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,20),
          trace = "none",
          dendrogram = "both",
          labRow = rownames(heatmap_matrix),
          keysize = 2,
          ylab = "Transcripts",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for differential genes

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$NI_MKI_mean <- rowMeans(detected_pseudocount[,6:11])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
ggplot(dif_analysis, aes(x = NI_V_mean, y = NI_MKI_mean)) +
  xlab("NI_V_Average(log2)") + ylab("NI_MKI_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "NI_MKI vs NI_V Scatter Plot")

# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V MA Plot")

# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V Volcano Plot")
## Warning: Removed 609 rows containing missing values (geom_point).